###########################################################################
###################         Minor Literature          #####################
###########################################################################
###########################################################################

#######   Summary of Workflow   #####

#Prior to this script
# - normalize spelling of all documents to US English
# - run all texts through the Litbank Tagger by D. Bamman et al

#Included in thes scripts
# - extract NER tagged locations from Litbank tagged texts
# - prepare gazetter of place names based on geonames.org database
# - match NER tagged locations with inferred national ID (i.e. give every location a predicted national context)
# - transform national IDs to in- and out-country labels based on the novel's country of origin
# - extract actor/event/place bigrams from wikipedia "history of [country x]" pages
# - statistical analyses of the distributions of place and event names in novels


setwd("~/Research/Minor Literature")

#ingest metadata
meta<-read.csv("min_meta_master.csv")

############## Prepare NER Data for Analysis ##############

#this script goes through litbank tagged tokens tables 
#and subsets by location codes GPE and LOC
#it outputs a table of works and their NER tagged locations
#it also outputs the token location of the location to understand spatial timing

setwd("~/Research/Minor Literature/min_LitBank_Data")

#set corpus
#corpus<-("French")
#corpus<-("German")
#corpus<-("Small_Europe")
corpus<-("US")

#get filenames
filenames<-list.files(corpus)

#setwd
setwd(paste("~/Research/Minor Literature/min_LitBank_Data/", corpus, sep=""))

geo.df<-NULL
for (i in 1:length(filenames)){
  print(i)
  a<-read.csv(filenames[i], sep="\t", quote = "", stringsAsFactors = F)
  #subset on geo-political entities 
  a.gpe<-a[which(a$label == "GPE"),]
  #subset on locations
  a.loc<-a[which(a$label == "LOC"),]
  #combine
  a.all<-rbind(a.gpe, a.loc)
  #remove "the"
  a.all$text<-gsub("the ", "", a.all$text, ignore.case = T)
  #remove rows where there are no capitalized words
  a.all<-a.all[grepl('[A-Z]', a.all$text),]
  #remove blank lines
  a.all<-a.all[a.all$text != "",]
  work<-filenames[i]
  work.abbrev<-gsub(".tokens","",work)
  iso<-as.character(meta$Country_ISO[which(work.abbrev == gsub(".txt", "",meta$ID))])
  token<-a.all$tokenId
  token.total<-max(a.all$tokenId)
  location<-a.all$text
  temp.df<-data.frame(corpus, iso, work, token, token.total, location)
  geo.df<-rbind(geo.df, temp.df)
}

#geo.all<-NULL
geo.all<-rbind(geo.all, geo.df)

setwd("~/Research/Minor Literature")
write.csv(geo.all, file="min_geoTags_all.csv", row.names = F)




########### Subset all geo tags by table of place names by country ###########

#this script ingests and cleans a gazetteer of place names drawn from 
#geonames.org
#https://download.geonames.org/export/dump/
#we use the file allCountries.txt

setwd("~/Research/Minor Literature")

#ingest our data's geo tags
geo.all<-read.csv("min_geoTags_all.csv", stringsAsFactors = F)

#ingest list of countries
our.countries<-read.csv("min_countries_ourData.csv", header=T)

#### ingest geo data #####

#codes
#V9 = country code
#V15 = population
#V7 = area type (A = country, state, region, P = city, H = water, L = Parks, T = Mountains)

#ingest all names
a<-read.csv("allCountries.txt", header = F, stringsAsFactors = F, sep="\t", quote="")

#continents
continents<-c("Europe", "South America", "North America", "Africa", "Antarctica", "Asia", "Middle East", "Far East", "Near East", "Central America")

#countries
#normalized by hand
countries<-read.csv("min_countries_allNames.csv", stringsAsFactors = F)

#states
states<-a[a$V8 == "ADM1",]
states<-states[states$V15 > 10000,]
#fix Canadian provinces
can<-states[which(states$V9 == "CA"),]
can$V2<-gsub("/Nouveau-Brunswick","",can$V2)
can$V2<-gsub(" and Labrador","",can$V2)
can$V2<-gsub("é","e",can$V2)
#keep only those states/regions that belong to our countries of interest
#states<-states[states$V9 %in% as.character(our.countries$iso),]
#add Canada
states<-rbind(states, can)

#add large regions
rgn<-a[which(a$V8 == "RGN"),]
rgn<-rgn[rgn$V2 %in% c("Siberia", "New England"),]
rgn<-rgn[-1,]
states<-rbind(states, rgn)

#subset by cities with a population greater than 10K
cities<-a[a$V7 == "P",]
cities<-cities[cities$V15 > 10000,]

#transform spelling
cities$V2[which(cities$V2 == "Genève")]<-c("Geneva")

#subset by regional names (water, mountains, parks, continents)
regions<-a[a$V7 %in% c("H", "L", "T", "S"),]
region.set<-c("LK", "LKS", "SEA", "STM", "PRK", "GNL", "BDG", "DSRT", "MT", "MTS")
regions<-regions[regions$V8 %in% region.set,]
#subset by country codes of interest
#regions<-regions[regions$V9 %in% as.character(our.countries$iso),]
#remove duplicates w other categories
regions<-regions[which(!regions$V2 %in% cities$V2),]
regions<-regions[which(!regions$V2 %in% states$V2),]
regions<-regions[which(!regions$V2 %in% countries$normal),]
#remove problems
regions<-regions[-which(regions$V2 == "Amazon"),]
#add Amazon river
regions$V2[regions$V2 == "Amazon River"]<-c("Amazon")

#add specially resolved names that are not included in the dictionary
custom1<-read.csv("min_geo_custom1.csv", stringsAsFactors = F)

#add specially resolved names that override our matching criteria (manually adjusted)
custom2<-read.csv("min_geo_custom2.csv", stringsAsFactors = F)
#remove duplicates
custom2<-custom2[order(custom2$name),]
custom2<-custom2[!duplicated(custom2$name),]



##########  MATCHING NOVEL PLACE NAMES WITH GEO TAGS    #############

#geo.all == all place names derived from litbank tagger
#geo.df == all place names that match geonames data
#ann.df == all matched place names resolved to country IDs using the heuristics below

#subset first by all names
#test without custom lists
geo.all.vector.test<-c(continents, regions$V2, cities$V2, states$V2, regions$V3, cities$V3, states$V3, countries$normal)
geo.test<-geo.all[which(geo.all$location %in% geo.all.vector.test),]

geo.custom2<-c(custom2$name)
geo.test<-geo.all[which(geo.all$location %in% geo.custom2),]

#run normally
geo.all.vector<-c(continents, regions$V2, cities$V2, states$V2, regions$V3, cities$V3, states$V3, countries$normal, custom1$place, custom2$name)
geo.df<-geo.all[which(geo.all$location %in% geo.all.vector),]

#attach country code for each place
#resolve nationality according to the following hierarchy:
#continent
#country
#states
#cities
#regions
#if multiple matches, choose ID with highest population
#if no population, then write "NA" for later inspection 

#create new table
ann.df<-geo.df
#create empty country vector
ann.df$country<-vector(length=nrow(ann.df))
#run for every row
for (j in 1:nrow(ann.df)){

  print(j)
  
  x<-as.character(ann.df$location[j])
  
  #start with custom lists
  #if x is in custom list1, use that
  if (x %in% custom1$place){
    country<-custom1$iso[which(custom1$place == x)]
  } else if (x %in% custom2$name){
    country<-custom2$iso[which(custom2$name == x)]
  #if x is in continents, use that
  } else if (x %in% continents){
    country<-x
  #if x is in countries use that
  } else if (x %in% countries$normal){
    country<-countries$V9[countries$normal == x]
  #if x is in states use that
  } else if (x %in% states$V2 | x %in% states$V3){
    country<-states$V9[which(states$V2 == x | states$V3 == x)][1]
  #if x is in cities use that
  } else if (x %in% cities$V2 | x %in% cities$V3){
    country<-cities$V9[which(cities$V2 == x | cities$V3 == x)]
    #if there are multiple matches
    if (length(country) > 1){
      sub<-cities[which(cities$V2 == x | cities$V3 == x),]
      #prefer city with highest population; if same city take first
      sub<-sub[which(sub$V15 == max(sub$V15)),]
      country<-sub$V9[1]
    } else {
      country<-country
    }
  #if no match to the above then check regions  
  } else if (x %in% regions$V2 | x %in% regions$V3){
    country<-regions$V9[which(regions$V2 == x | regions$V3 == x)]
    #if there are multiple regions matched
    if (length(country) > 1){
      #and they are all the same region
      if (nlevels(factor(country)) == 1){
        #take the first match
        country<-country[1]
        #otherwise if there are multiple nations
      } else if (nlevels(factor(country)) > 1){
        sub<-regions[regions$V2 == x,]
        #first check if one of the locations is in the novel's home country. prefer this.
        if (ann.df$iso[j] %in% sub$V9){
          country<-sub$V9[sub$V9 == ann.df$iso[j]][1]
        } else {
          #otherwise take the first country. This will resolve as "foreign" in our analysis.
          country<-sub$V9[sub$V9 == ann.df$iso[j]][1]
        }
      }
    }
  }
  #return(country)
  ann.df$country[j]<-country
}
write.csv(ann.df, file="min_geoTags_Annotated_All.csv", row.names = F)



########### TRANSFORM country mentions into place types ###############

#############    Method 1: Corpus Level Comparisons      ##############
#get bootstrap estimate for entire corpus

boot.df<-NULL
for (j in 1:nlevels(factor(ann.df$corpus))){
  print(j)
  #subset by corpus
  sub<-ann.df[ann.df$corpus == levels(factor(ann.df$corpus))[j],]
  #run bootstrap sampling
  for (i in 1:1000){
    #get sample w replacement
    boot<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    #calculate scores
    national<-nrow(boot[which(boot$country == boot$iso[1]),])/nrow(boot)
    international<-nrow(boot[which(boot$country != boot$iso[1]),])/nrow(boot)
    ratio<-national/international
    corpus<-boot$corpus[1]
    temp.df<-data.frame(corpus, ratio, national, international)
    boot.df<-rbind(boot.df, temp.df)
  }
}

####### Visualize ######

boxplot(national ~ corpus, data=boot.df)
ggplot(boot.df, aes(x = ratio, y = corpus, fill=corpus)) +
  geom_density_ridges(scale = 1) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = 4) +
  #xlim(0, max(output.df$ratio)) +
  theme_ridges()


################## National Mentions Calculation #######################

#here we calculate the frequency of national mentions by corpus
#"national mention" == exclusive invocation of national frame (German, Germany or France/French, etc)
#we then compare the odds ratio of seeing national mentions by corpus

setwd("~/Research/Minor Literature")

meta<-read.csv("min_meta_master.csv")

setwd("~/Research/Minor Literature/min_LitBank_Data")

#set corpus
#corpus<-("French")
#corpus<-("German")
#corpus<-("Small_Europe")
corpus<-("US")


#get filenames
filenames<-list.files(corpus)

#change wd
setwd(paste("~/Research/Minor Literature/min_LitBank_Data/", corpus, sep=""))

#ingest bookNLP tables and calculate national mentions
#national.v<-vector()
nat.df<-NULL
for (i in 1:length(filenames)){
  #read in novel
  a<-read.csv(filenames[i], sep="\t", quote="", stringsAsFactors = F)
  #subset metadata
  meta.sub<-meta[gsub(".txt","",meta$ID) == gsub(".tokens", "",filenames[i]),]
  #create vector of national names for that novel
  nat_names<-unlist(strsplit(as.character(meta.sub$National_Names), "_"))
  #subset a by nat names
  a.sub<-a[tolower(a$lemma) %in% nat_names,]
  
  #for US add bigram matching
  a.sub2<-a[grep("united states", tolower(a$text)),]
  a.sub<-rbind(a.sub2, a.sub)
  
  #for novel specific counts
  work<-filenames[i]
  nat.count<-nrow(a.sub)
  word.count<-meta.sub$wordCount
  temp.df<-data.frame(corpus, work, nat.count, word.count)
  nat.df<-rbind(nat.df, temp.df)
}

#nat.all<-NULL
nat.all<-rbind(nat.all, nat.df)

#make contingency table
fr.nat<-sum(nat.all$nat.count[nat.all$corpus == "French"])
fr.all<-sum(nat.all$word.count[nat.all$corpus == "French"])
de.nat<-sum(nat.all$nat.count[nat.all$corpus == "German"])
de.all<-sum(nat.all$word.count[nat.all$corpus == "German"])
us.nat<-sum(nat.all$nat.count[nat.all$corpus == "US"])
us.all<-sum(nat.all$word.count[nat.all$corpus == "US"])
min.nat<-sum(nat.all$nat.count[nat.all$corpus == "Small_Europe"])
min.all<-sum(nat.all$word.count[nat.all$corpus == "Small_Europe"])

nat.df<-data.frame(c(fr.nat, fr.all),c(de.nat, de.all), c(us.nat, us.all), c(min.nat, min.all))

colnames(nat.df)<-c("france", "german", "us", "minor")
nat.df[1,]/nat.df[2,]
chisq.test(nat.df)
#subset pairs
nat.sub<-nat.df[,colnames(nat.df) %in% c("france", "minor")]
fisher.test(nat.sub)

#save table
setwd("~/Research/Minor Literature")
write.csv(nat.df, file="min_geo_nationalMentions_Aggregate.csv", row.names = F)

### add to final table below ####
nat.all$work.sub<-gsub(".tokens", "", nat.all$work)
nat.all<-nat.all[order(nat.all$work.sub),]
output.df<-output.df[order(output.df$work),]
which(output.df$work != nat.all$work.sub)
output.df$nat.ref<-nat.all$nat.count


########## Method 2: Per novel rates ###########

library(DescTools)

#takes as input the output of ann.df

#for each novel, get:
#placeCount
#national
#international
#wordCount

meta<-read.csv("min_meta_master.csv")

ann.df<-read.csv("min_geoTags_Annotated_All.csv", stringsAsFactors = F)

#remove NAs
ann.df<-ann.df[complete.cases(ann.df$country),]

output.df<-NULL
for (i in 1:nlevels(factor(ann.df$work))){
  
  #subset by work
  sub<-ann.df[ann.df$work == levels(factor(ann.df$work))[i],]
  
  #metadata
  work<-gsub(".tokens","",levels(factor(ann.df$work))[i])
  wordCount<-meta$wordCount[which(gsub(".txt","",meta$ID) == work)]
  corpus<-sub$corpus[1]
  country<-sub$iso[1]
  
  #overall place counts
  placeCount<-nrow(sub)
  
  #national and international counts
  #use grep to capture places w multiple countries
  national<-nrow(sub[grep(sub$iso[1], sub$country),])
  international<-nrow(sub)-national
  
  #international entropy
  inter.entropy<-Entropy(table(factor(sub$country)), base=exp(1))

  #intranational entropy
  #create table of local places
  nat.df<-sub[sub$country == sub$iso[1],]
  #create vector of national references
  nat.v<-unlist(strsplit(as.character(meta$National_Names[which(gsub(".txt","",meta$ID) == work)]), "_"))
  #remove references to nation
  nat.df<-nat.df[!tolower(nat.df$location) %in% nat.v,]
  #calculate intranational entropy
  intra.entropy<-Entropy(table(nat.df$location), base=exp(1))
  
  #store as table
  temp.df<-data.frame(work, corpus, country, wordCount, placeCount, national, international, inter.entropy, intra.entropy)
  output.df<-rbind(output.df, temp.df)
}

#add variables
#national frequency
output.df$nat.freq<-output.df$national/output.df$wordCount
#international frequency
output.df$int.freq<-output.df$international/output.df$wordCount
#place frequency
output.df$place.freq<-(output.df$national+output.df$international)/output.df$wordCount


######################      WIKI  MODELING         ##########################

#Ingest a novel
#Create dictionary of links from national wikipedia page (history of) 
#Match novel places (GPE)
#Match novel 2-4 grams

library(tm)

setwd("~/Research/Minor Literature")

#ingest metadata
meta<-read.csv("min_meta_master.csv")

#Ingest Novel bookNLP tables for each corpus

#set corpus
#corpus<-("French")
#corpus<-("German")
#corpus<-("US")
corpus<-("Small_Europe")

setwd("~/Research/Minor Literature/min_LitBank_Data")

#get novel filenames
filenames<-list.files(corpus)

#wiki filenames
setwd("~/Research/Minor Literature/")
wiki.files<-list.files("min_Wiki_HTML")


#ingest bookNLP tables and wiki tables and compare similarity
wiki.df<-NULL
for (i in 1:length(filenames)){
  print(i)
  #set wd for novels
  setwd(paste("~/Research/Minor Literature/min_LitBank_Data/", corpus, sep=""))
  #read in novel
  a1<-read.csv(filenames[i], sep="\t", quote="", stringsAsFactors = F)
  
  #set wd for wiki
  setwd("~/Research/Minor Literature/min_Wiki_HTML")
  
  #subset metadata to identify country ID
  meta.sub<-meta[gsub(".txt","",meta$ID) == gsub(".tokens", "",filenames[i]),]
  
  country<-as.character(meta.sub$Country_ISO)
  
  #read in wiki text
  w<-scan(paste(country, ".txt", sep=""), what="character", sep="\n", quote="", quiet=T)
  #keep only linked text
  w1<-w[grep("title=", w)]
  w1<-sub(".*title", "", w1)
  w1<-sub("^[^>]*", "", w1)
  w1<-gsub("[[:punct:]]", "",w1)
  w1<-gsub('.{1}$', '', w1)
  w1<-gsub("[[:digit:]]", "", w1)
  w1<-w1[w1 != ""]
  
  #create list of places in novel
  a2<-a1[a1$label %in% c("GPE"),]
  a2<-a2[a2$text != "",]
  #keep only text that has at least one capital word
  a2<-a2[grep("[A-Z]",a2$text),]
  
  #match places with wiki list of links
  #gpe<-nrow(a2[a2$text %in% w1,])
  
  #match 2-4grams
  #collapse word vector from NLP table
  text.whole<-paste(a1$originalWord, collapse=" ") # collapse into single chunk
  #turn into corpus object for tm package
  corpus1<-VCorpus(VectorSource(text.whole))
  #ngram function
  BigramTokenizer23 <- function(x)unlist(lapply(ngrams(words(x), 2:3), paste, collapse = " "), use.names = FALSE)
  #make ngram table
  dtm.ngram <- DocumentTermMatrix(corpus1, control=list(tokenize = BigramTokenizer23, wordLengths=c(1,Inf)))
  #match colnames
  hist<-length(which(colnames(dtm.ngram) %in% tolower(w1)))
  
  #get total frequency
  #entity.score<-(gpe+hist)/nrow(a1[-grep("[[:punct:]]", a1$lemma),])
  entity.score<-hist
  
  #make table
  work<-filenames[i]
  temp.df<-data.frame(corpus, work, entity.score)
  wiki.df<-rbind(wiki.df, temp.df)
}

#wiki.all<-wiki.df
#wiki.all<-rbind(wiki.all, wiki.df)
#wiki.nogeo<-wiki.df
wiki.nogeo<-rbind(wiki.nogeo, wiki.df)

boxplot(entity.score ~ corpus, data = wiki.nogeo)
kruskal.test(entity.score ~ corpus, data = wiki.all)
wilcox.test(wiki.all$entity.score[wiki.all$corpus == "French"], wiki.all$entity.score[wiki.all$corpus == "Small_Europe"])
tapply(wiki.nogeo$entity.score, wiki.nogeo$corpus, median)

#combine with national mentions
output.df$nationalism.quotient<-(output.df$national+output.df$wiki.score)/output.df$wordCount

########### write to final table ################

write.csv(output.df, file="min_geo_finalTable.csv", row.names = F)


#################### Adjusting Values Using Messam et al's Method #####################
################### to account for differential levels of error in place detection ############

#this section addresses the validation of the country-prediction process
#the goal is to assess whether different corpora exhibit different levels of accuracy
#wrt to the NER process
#the scripts take as input annotated tables with "ground-truth" labels
#and generate accuracy scores based on those labels
#it uses bootstrap sampling to get estimated levels of sensitivity and specificity
#due to the low number of ground truth annotations

#ingest annotated tables and bind into single table

#subset by NER annotations and human annotations

#TP = all annotated rows where NER and RA agree
#FP = all annotated rows where NER and RA don't agree
#FN = all annotated rows by RA where there is no NER annotation
#TN = all non-annotated rows by NER and RA

#set working folder
folder<-c("min_Validation_MD_03_ANN_AP")

#setwd
setwd(paste("~/Research/Minor Literature/min_Validation/", folder, sep = ""))

#get filenames
filenames<-list.files(".")

#ingest and combine all tables
#df0 <- lapply(filenames, read.csv)
#df1<- do.call(rbind.data.frame, df0)

#ingest and append filename and RA name as separate column
#repeat for each subdirectory of validated passages
df1<-NULL
for (i in 1:length(filenames)){
  a<-read.csv(filenames[i])
  a$file.id<-filenames[i]
  a$ra.id<-folder
  df1<-rbind(df1, a)
}

#df<-NULL
df<-rbind(df, df1)

#create bigram of tokenID and originalWord
df$bigram<-paste(df$tokenId, df$originalWord, sep="_")

#establish codes to keep
#remove first code which equals non-annotation
levels(factor(df$annotated_iso))
levels(factor(df$RA_iso))
keep1<-levels(factor(df$annotated_iso))[2:length(levels(factor(df$annotated_iso)))]
keep2<-levels(factor(df$RA_iso))[2:length(levels(factor(df$RA_iso)))]
#subset by matching on either column
#i.e. keep a row of either the NER or RA annotated it
df.ann<-df[which(df$annotated_iso %in% keep1 | df$RA_iso %in% keep2),]
#write.csv(df.ann, file="min_Validation_ErrorTable.csv", row.names = F)

test<-read.csv("min_Validation_ErrorTable.csv")
test2<-read.csv("min_Validation_All.csv")
nrow(test)/nrow(test2)

#### RUN 3x - for ALL, INTERNATIONAL, NATIONAL ########

#ingest error table
df.ann<-read.csv("min_Validation_ErrorTable.csv", stringsAsFactors = F)

#transform NAs into 0s
df.ann$annotated_iso[is.na(df.ann$annotated_iso)] <- c("0")

#create bookID
df.ann$book<-gsub("_Europe","",df.ann$file.id)
df.ann$book<-gsub("_.*","",df.ann$book)
df.ann$book<-gsub(".csv", "", df.ann$book)

df.ann.all<-df.ann

#### ALL ####
df.ann<-df.ann.all

### INTERNATIONAL ###
df.ann<-df.ann.all[which(df.ann.all$RA_iso != df.ann.all$novel_iso),]

### NATIONAL ###
df.ann<-df.ann.all[which(as.character(df.ann.all$RA_iso) == as.character(df.ann.all$novel_iso)),]

### observe distribution of errors across books ####

#all
#sort(table(df.ann$book))
#by corpus
#sort(table(df.ann$book[df.ann$corpus == "US"]))

### adjust tables by removing most extreme books -- i.e. estimated errors are overweighted by a single document ####

#1. remove most extreme document from each corpus
#US
df.ann<-df.ann[!df.ann$book %in% names(which(table(df.ann$book[df.ann$corpus == "US"]) == max(sort(table(df.ann$book[df.ann$corpus == "US"]))))),]
#German
df.ann<-df.ann[!df.ann$book %in% names(which(table(df.ann$book[df.ann$corpus == "German"]) == max(sort(table(df.ann$book[df.ann$corpus == "German"]))))),]
#French
df.ann<-df.ann[!df.ann$book %in% names(which(table(df.ann$book[df.ann$corpus == "French"]) == max(sort(table(df.ann$book[df.ann$corpus == "French"]))))),]
#Small Europe
df.ann<-df.ann[!df.ann$book %in% names(which(table(df.ann$book[df.ann$corpus == "Small_Europe"]) == max(sort(table(df.ann$book[df.ann$corpus == "Small_Europe"]))))),]

#true positives
#when RA and NER agree
tp.df<-df.ann[which(df.ann$annotated_iso == df.ann$RA_iso),]

#false positives
#when NER annotates as a place and RA does not match
fp.df<-df.ann[which(df.ann$annotated_iso != df.ann$RA_iso),]
fp.df<-fp.df[which(fp.df$annotated_iso != "0"),]
#do not include if RA annotated a different *foreign* country, i.e. both will register
#as not-national. So drop rows where there are both annotated but *neither* == host country 
fp.df2<-NULL
for (j in 1:nrow(fp.df)){
  sub<-fp.df[j,]
  if (sub$annotated_iso == sub$novel_iso | sub$RA_iso == sub$novel_iso){
    fp.df2<-rbind(fp.df2, sub)
  }
}
#then add non-included rows to TP table
fp.df3<-fp.df[!fp.df$tokenId %in% fp.df2$tokenId,]
tp.df<-rbind(tp.df, fp.df3)

## false negatives
#all RA annotations of place with no NER annotation
fn.df<-df.ann[which(df.ann$annotated_iso != df.ann$RA_iso),]
fn.df<-fn.df[which(fn.df$annotated_iso == "0"),]

### true negatives
#all non-annotated rows by either NER or RA
#tn.df<-df[which(!df$bigram %in% df.ann$bigram),]


#final scores
fp<-nrow(fp.df2)
tp<-nrow(tp.df)
fn<-nrow(fn.df)
#tn<-nrow(tn.df)

#precision and recall for whole data set
precision<-tp/(tp+fp)
recall<-tp/(tp+fn)

#for each corpus
precision.table<-table(tp.df$corpus)/(table(tp.df$corpus)+table(fp.df2$corpus))
recall.table<-table(tp.df$corpus)/(table(tp.df$corpus)+table(fn.df$corpus))

#store recall in data.frame
#recall.df<-data.frame(recall.table)
#recall.df<-rbind(recall.df, data.frame(Var1=c("All"), Freq=recall))
recall.df<-cbind(recall.df,append(unname(recall.table), recall))
write.csv(recall.df, file="min_Validation_Recall_Table.csv", row.names = F)

#calculate sensitivity and specificity by corpus
specificity<-table(tn.df$corpus)/(table(tn.df$corpus)+table(fp.df2$corpus))
sensitivity<-table(tp.df$corpus)/(table(tp.df$corpus)+table(fn.df$corpus))

#### bootstrap estimation of true error #####

### errors are not equally distributed across books
### thus some books are accounting for a high degree of the error in our validation sample
### this is especially problematic for the multilingual corpus (Small Europe)
### where 1-2 languages / examples may be driving most of the error
### this method derives a confidence interval around our observed error to use
### in a Bayesian analysis of true prevalence

boot.df<-NULL
for (i in 1:1000){
  print(i)
  test.df<-NULL
  #for each corpus, bootstrap sample
  for (j in 1:4){
    sub<-df.ann[df.ann$corpus == levels(factor(df.ann$corpus))[j],]
    sub2<-sub[sample(nrow(sub), nrow(sub), replace = T),]
    test.df<-rbind(test.df, sub2)
  }

  #tps
  tp.df<-test.df[which(test.df$annotated_iso == test.df$RA_iso),]
  
  ## false negatives
  #all RA annotations of place with no NER annotation
  fn.df<-test.df[which(test.df$annotated_iso != test.df$RA_iso),]
  fn.df<-fn.df[which(fn.df$annotated_iso == "0"),]
  
  #calculate sensitivity and specificity by corpus
  #specificity<-table(tn.df$corpus)/(table(tn.df$corpus)+table(fp.df2$corpus))
  sensitivity<-table(tp.df$corpus)/(table(tp.df$corpus)+table(fn.df$corpus))
  
  temp.df<-data.frame(sensitivity)
  boot.df<-rbind(boot.df, temp.df)
}

boxplot(Freq~Var1, data=boot.df)

#boot.df.nat<-boot.df
#boot.df.int<-boot.df
boot.df.all<-boot.df


######### Use Bayesian adjustment method ###########
#McV. Messam, L., Branscum, A., Collins, M., & Gardner, I. (2008). 
#Frequentist and Bayesian approaches to prevalence estimation using examples from Johne's disease. 
#Animal Health Research Reviews, 9(1), 1-23. doi:10.1017/S1466252307001314
library(rjags)
library(prevalence)

####### Corpus Level Estimation ##########

#generate table of estimated true prevalence with confidence intervals
#run 2 times for national and all

prev.df<-NULL
for (i in 1:nlevels(factor(boot.df$Var1))){

#get upper and lower bounds for sensitivity estimate by country
lower<-quantile(boot.df$Freq[boot.df$Var1 == levels(factor(boot.df$Var1))[i]], probs = c(0.1))[[1]]
upper<-quantile(boot.df$Freq[boot.df$Var1 == levels(factor(boot.df$Var1))[i]], probs = c(0.9))[[1]]

#get estimated mean sensitivity
mean.freq<-mean(boot.df$Freq[boot.df$Var1 == levels(factor(boot.df$Var1))[i]])

#get number of positive samples
x<-sum(output.df$national[output.df$corpus == levels(factor(output.df$corpus))[i]])
#total sample size
n<-sum(output.df$wordCount[output.df$corpus == levels(factor(output.df$corpus))[i]])

#first get estimated true prevalence using modified Rogan and Gladen
#True Prevalence	  =  	(Apparent Prevalence + (Specificity − 1))/(Specificity + (Sensitivity − 1))
#here just divide by sensitivity
#(x/n)/mean.freq

### true prevalence estimate using Bayesian modeling ####

#model sensitivity as a distribution based on validation tables
SE<-list(dist = "beta-expert", mean = mean.freq, lower = lower, upper=upper, p = 0.95)
myModel<-truePrev(x = x, n = n, SE = SE, SP = 1)

#get estimated true prevalence
actual.prev<-x/n
true.prev<-mean(myModel@mcmc$TP[[1]])
#get confidence intervals
tp.upper<-quantile(myModel@mcmc$TP[[1]], probs = c(0.025, 0.975))[[2]]
tp.lower<-quantile(myModel@mcmc$TP[[1]], probs = c(0.025, 0.975))[[1]]
#BGR statistic (fit)
BGR<-myModel@diagnostics$BGR[[2]]
#make data frame
corpus<-levels(factor(boot.df$Var1))[i]
temp.df<-data.frame(corpus, actual.prev, BGR, true.prev, tp.lower, tp.upper)
prev.df<-rbind(prev.df, temp.df)
}

#inspect traditional confidence intervals if validation error is assumed to be exact
n.v<-tapply(output.df$wordCount, output.df$corpus, sum)
propCI((n.v*prev.df$true.prev),n.v, method = "wald")

#diagnostics
#plot(myModel)
#myModel@diagnostics$DIC

#plot
library(ggplot2)
ggplot(prev.df, aes(y=true.prev, x=reorder(corpus, true.prev))) + 
  theme_bw() +
  geom_point(size=3)+
  #geom_pointrange(aes(ymin=tp.lower, ymax=tp.upper))
  geom_errorbar(aes(ymin=tp.lower, ymax=tp.upper), width=.1) +
  scale_x_discrete(labels=c("Small_Europe" = "Minor", "German" = "Germany",
                          "French" = "France", "US" = "US")) +
  ggtitle("Esimated Prevalence of National Mentions\n(N=200)")+
  ggtitle("Esimated Prevalence of All Place Mentions\n(N=200)")+
  xlab("")+
  ylab("Frequency")

####### Document Level Estimation ###########

### adjust by dividing by sensitivity based on bootstrap estimates
sensitivity<-tapply(boot.df$Freq, boot.df$Var1, mean)

output.df$nat.adj<-vector(mode="numeric",length=nrow(output.df))
for (i in 1:nrow(output.df)){
  #for each row multiply the observed count by adjusted count based on that country's accuracy  
  #sp<-unname(specificity[names(specificity) == output.df$corpus[i]])
  se<-unname(sensitivity[names(sensitivity) == output.df$corpus[i]])
  myModel<-truePrev(x = output.df$national[i], n = output.df$wordCount[i], SE = se, SP = 1)
  output.df$nat.adj[i]<-mean(myModel@mcmc$TP[[1]])
}

output.df$int.adj<-vector(mode="numeric",length=nrow(output.df))
for (i in 1:nrow(output.df)){
  #for each row multiply the observed count by adjusted count based on that country's accuracy  
  #sp<-unname(specificity[names(specificity) == output.df$corpus[i]])
  se<-unname(sensitivity[names(sensitivity) == output.df$corpus[i]])
  myModel<-truePrev(x = output.df$international[i], n = output.df$wordCount[i], SE = se, SP = 1)
  output.df$int.adj[i]<-mean(myModel@mcmc$TP[[1]])
}


#create adjusted nationalism quotient that uses the adjusted national count
output.df$nationalism.quotient.adj<-output.df$nat.adj+((output.df$wiki.score+output.df$nat.ref)/output.df$wordCount)



##############################################################################
############################     Analyze Results     #########################
##############################################################################


setwd("~/Research/Minor Literature")
output.df<-read.csv("min_geo_finalTable.csv")

###################     Corpus Level Effects      ######################

###### National references ########
fr.nat<-sum(output.df$nat.ref[output.df$corpus == "French"])
fr.all<-sum(output.df$wordCount[output.df$corpus == "French"])
de.nat<-sum(output.df$nat.ref[output.df$corpus == "German"])
de.all<-sum(output.df$wordCount[output.df$corpus == "German"])
us.nat<-sum(output.df$nat.ref[output.df$corpus == "US"])
us.all<-sum(output.df$wordCount[output.df$corpus == "US"])
min.nat<-sum(output.df$nat.ref[output.df$corpus == "Small_Europe"])
min.all<-sum(output.df$wordCount[output.df$corpus == "Small_Europe"])

nat.df<-data.frame(c(fr.nat, fr.all),c(de.nat, de.all), c(us.nat, us.all), c(min.nat, min.all))

colnames(nat.df)<-c("france", "german", "us", "minor")
nat.df[1,]/nat.df[2,]
chisq.test(nat.df)
#subset pairs
nat.sub<-nat.df[,colnames(nat.df) %in% c("france", "minor")]
fisher.test(nat.sub)
write.csv(nat.df, file="min_Model1_Table.csv", row.names = T)

######### Wiki References #########

wiki.rate<-tapply(output.df$wiki.score, output.df$corpus, sum)
total.rate<-tapply(output.df$wordCount, output.df$corpus, sum)
wiki.df<-t(data.frame(unname(wiki.rate), unname(total.rate)))
colnames(wiki.df)<-names(wiki.rate)
row.names(wiki.df)<-c("wiki count", "all tokens")
chisq.test(wiki.df)
wiki.df[1,]/wiki.df[2,]
wiki.sub<-wiki.df[,colnames(wiki.df) %in% c("US", "Small_Europe")]
fisher.test(wiki.sub)
write.csv(wiki.df, file="min_Model2_Table.csv")


########  Model 2 #########

######### Geo place mentions overall #################
##see min_Validation_TruePrevalenceTable_AllGeo.csv

######### Geo place mentions - National ##############
##see min_Validation_TruePrevalenceTable_National.csv


############### Document Level ##############
library(ggplot2)
ggplot(output.df, aes(x=reorder(corpus, nationalism.quotient.adj), y = nationalism.quotient.adj))+
  theme_bw() +
  geom_boxplot(outlier.shape = NA) +
  scale_x_discrete(labels=c("Small_Europe" = "Minor", "German" = "Germany",
                            "French" = "France", "US" = "US")) +
  ggtitle("Nationalism Quotient (N=200)")+
  #ggtitle("Esimated Prevalence of All Place Mentions\n(N=200)")+
  ylim(c(0,.0061)) +
  xlab("")+
  ylab("Frequency")

tapply(output.df$nationalism.quotient.adj, output.df$corpus, median)

wilcox.test(output.df$nationalism.quotient.adj[output.df$corpus == "German"], output.df$nationalism.quotient.adj[output.df$corpus == "Small_Europe"])

kruskal.test(nationalism.quotient.adj ~ corpus, data=output.df)


#ridge plots
library(ggridges)
output.df$ratio<-output.df$nat.adj/output.df$int.adj
ggplot(output.df[output.df$ratio < 3,], aes(x = ratio, y = reorder(corpus, ratio), fill=corpus)) +
  geom_density_ridges(scale = 2, show.legend = F) +
  scale_y_discrete(expand = c(0, 0), labels=c("Small_Europe" = "Minor", "German" = "Germany",
                                              "French" = "France", "US" = "US")) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = 4) +
  #xlim(0, 3) +
  theme_ridges()+
  geom_vline(xintercept = 1, linetype="dashed") +
  xlab("Ratio")+
  ylab("Corpus")


  
#effect size calculator
library(effsize)
cohen.d(output.df$nationalism.quotient.adj[output.df$corpus == "Small_Europe"], output.df$nationalism.quotient.adj[output.df$corpus == "German"])



##################################    OTHER     ###########################################

######### Validate NER across corpora ##############

# extract 100 random passages of 50 tokens in length from each corpus

setwd("~/Research/Minor Literature")

#ingest metadata
meta<-read.csv("min_meta_master.csv", stringsAsFactors = F)

#establish text size window
window<-49

#establish corpus names
corpus.all<-c("French", "German","Small_Europe", "US")

#run
for (m in 1:4){
  
#set corpus
setwd("~/Research/Minor Literature/min_LitBank_Data")

#set corpus
corpus<-corpus.all[m]

#get filenames
filenames<-list.files(corpus)

#create empty table
for (i in 1:length(filenames)){
  
  #setwd
  setwd(paste("~/Research/Minor Literature/min_LitBank_Data/", corpus, sep=""))
  
  #ingest novel
  a<-read.csv(filenames[i], sep="\t", quote = "", stringsAsFactors = F)
  
  #create loop
  for (j in 1:100){
    #take random passage
    start<-sample((min(a$tokenId)+25):(max(a$tokenId)-(window+100)), 1)
    #samp1<-a[start:(start+249),]
    #a<-a[!a$tokenId %in% samp1$tokenId,]
    samp<-a[start:(start+window),]
    
    #annotate samples w novel country ID
    samp$novel_iso<-meta$Country_ISO[gsub(".txt","",meta$ID) == gsub(".tokens","",filenames[i])]
    
    #annotate with country prediction
    
    #add annotation column
    samp$annotated_iso<-numeric(nrow(samp))
    
    #keep only if there is at least one token with
    #a capital letter, not a character name and not the start of a sentence
    
    samp.test<-samp
    
    #remove all words after beginning quotes
    if (length(which(samp.test$inQuotation == "B-QUOTE")) > 0){
      samp.test<-samp.test[-(which(samp.test$inQuotation == "B-QUOTE")+1),]
    }
    
    #remove all quotes
    samp.test<-samp.test[which(!samp.test$pos %in% c("\``", "\''", "\"")),]
    
    #remove all tokens after period, exclamation or question mark
    samp.test<-samp.test[-(which(samp.test$normalizedWord %in% c("!", ".", "?","\""))+1),]
    
    #remove words in all caps
    samp.test<-samp.test[-grep("^[[:upper:]]+$", samp.test$originalWord),]
    
    #remove all characters
    samp.test<-samp.test[-which(samp.test$label == "PER" | samp.test$ner == "PERSON" | samp.test$supersense == "B-noun.person"),]
    
    #remove all pronouns
    samp.test<-samp.test[which(!samp.test$lemma %in% c("she", "he", "it")),]
    
    #remove all non-nouns
    samp.test<-samp.test[grep("NN", samp.test$pos),]
    
    #if there is still a word with capital first letter keep this passage
    
    if (length(grep("^[[:upper:]]", samp.test$originalWord)) > 0){
      break
    }
  }

  #subset by matching on place names
  samp.sub<-samp[samp$label %in% c("GPE", "LOC"), ]
  
  #if > 0 then annotate
  if (nrow(samp.sub) > 0){
    for (j in 1:nrow(samp.sub)){
      
      #set default as NA
      country<-c("NA")
      
      x<-as.character(samp.sub$text[j])
      
      #start with custom lists
      #if x is in custom list1, use that
      if (x %in% custom1$place){
        country<-custom1$iso[which(custom1$place == x)]
      } else if (x %in% custom2$name){
        country<-custom2$iso[which(custom2$name == x)]
        #if x is in continents, use that
      } else if (x %in% continents){
        country<-x
        #if x is in countries use that
      } else if (x %in% countries$normal){
        country<-countries$V9[countries$normal == x]
        #if x is in states use that
      } else if (x %in% states$V2 | x %in% states$V3){
        country<-states$V9[which(states$V2 == x | states$V3 == x)][1]
        #if x is in cities use that
      } else if (x %in% cities$V2 | x %in% cities$V3){
        country<-cities$V9[which(cities$V2 == x | cities$V3 == x)]
        #if there are multuple matches
        if (length(country) > 1){
          sub<-cities[which(cities$V2 == x | cities$V3 == x),]
          #prefer city with highest population; if same city take first
          sub<-sub[which(sub$V15 == max(sub$V15)),]
          country<-sub$V9[1]
        } else {
          country<-country
        }
        #if no match to the above then check regions  
      } else if (x %in% regions$V2 | x %in% regions$V3){
        country<-regions$V9[which(regions$V2 == x | regions$V3 == x)]
        #if there are multiple regions matched
        if (length(country) > 1){
          #and they are all the same region
          if (nlevels(factor(country)) == 1){
            #take the first match
            country<-country[1]
            #otherwise if there are multiple nations
          } else if (nlevels(factor(country)) > 1){
            sub<-regions[regions$V2 == x,]
            #first check if one of the locations is in the novel's home country. prefer this.
            if (samp$novel_iso[j] %in% sub$V9){
              country<-sub$V9[sub$V9 == samp$novel_iso[j]][1]
            } else {
              #otherwise take first country
              country<-country<-sub$V9[1]
            }
          }
        }
      }
      #return(country)
      samp$annotated_iso[samp$tokenId == samp.sub$tokenId[j]]<-country
    }
  }

  #annotate with corpus
  samp$corpus<-meta$Corpus[gsub(".txt","",meta$ID) == gsub(".tokens","",filenames[i])]

  #clean unnecessary columns
  samp<-samp[,colnames(samp) %in% c("tokenId", "originalWord", "novel_iso", "annotated_iso", "corpus")]
  
  #add annotator column
  samp$RA_iso<-c("")
  
  #write to new folder
  #create filename
  validate.filename<-paste(corpus,sprintf("%03d", seq(1:50))[i], "_03", ".csv", sep="")
  #setwd to RA
  setwd("~/Research/Minor Literature/min_Validation/min_Validation_MD_03")
  write.csv(samp, file=validate.filename, row.names = F)
}
}


######### Get word counts per novel ################

setwd("~/Research/Minor Literature/min_LitBank_Data")
filenames<-list.files("Small_Europe")
setwd("~/Research/Minor Literature/min_LitBank_Data/Small_Europe")
word.count.df<-NULL
for (i in 1:length(filenames)){
  w<-read.csv(filenames[i], sep="\t", quote="", stringsAsFactors = F)
  work<-gsub(".tokens", "", filenames[i])
  count<-nrow(w[-grep("[[:punct:]]", w$lemma),])
  temp.df<-data.frame(work, count)
  word.count.df<-rbind(word.count.df, temp.df)
}

#final.wc<-word.count.df
final.wc<-rbind(final.wc, word.count.df)

#attach to metadata
meta.final<-NULL
for (i in 1:nrow(meta)){
  meta.sub<-meta[i,]
  wc.sub<-final.wc[final.wc$work == gsub(".txt","",meta.sub$ID),]
  meta.sub$wordCount<-wc.sub$count
  meta.final<-rbind(meta.final, meta.sub)
}
write.csv(meta.final, file="min_meta_master.csv", row.names = F)


